From f7486d86420c3b7b189754ce220bb126f01f20a8 Mon Sep 17 00:00:00 2001 From: Aryeh Gregor Date: Thu, 8 Jan 2009 23:59:25 +0000 Subject: [PATCH] Reduce code duplication correctly this time, again The test cases I thought up are at: http://www.mediawiki.org/wiki/User:Simetrical/Id_tests All of them pass with the patch, except for some that fail on current code as well: the ones involving templates, multiply-occurring section headers, or numeric ids (there seems to be a weird bug with those that probably involves string and numeric ids being used in the same array). This is true whether $wgEnforceHtmlIds is on or off. (Actually, the problem with numeric keys doesn't happen with $wgEnforceHtmlIds off, because of course numeric ids aren't allowed then.) --- includes/parser/Parser.php | 150 ++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 77 deletions(-) diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index 7fcfb90a35..1a4901c392 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -3448,7 +3448,7 @@ class Parser * @private */ function formatHeadings( $text, $isMain=true ) { - global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds; + global $wgMaxTocLevel, $wgContLang; $doNumberHeadings = $this->mOptions->getNumberHeadings(); $showEditLink = $this->mOptions->getEditSection(); @@ -3593,71 +3593,17 @@ class Parser } } - # The safe header is a version of the header text safe to use for links - # Avoid insertion of weird stuff like by expanding the relevant sections - $safeHeadline = $this->mStripState->unstripBoth( $headline ); - - # Remove link placeholders by the link text. 
- # - # turns into - # link text with suffix - $safeHeadline = $this->replaceLinkHoldersText( $safeHeadline ); - - # Strip out HTML (other than plain <sup> and <sub>: bug 8393) - $tocline = preg_replace( - array( '#<(?!/?(sup|sub)).*?'.'>#', '#<(/?(sup|sub)).*?'.'>#' ), - array( '', '<$1>'), - $safeHeadline - ); - $tocline = trim( $tocline ); - - # For the anchor, strip out HTML-y stuff period - $safeHeadline = preg_replace( '/<.*?'.'>/', '', $safeHeadline ); - $safeHeadline = trim( $safeHeadline ); - - # Save headline for section edit hint before it's escaped - $headlineHint = $safeHeadline; - - if ( $wgEnforceHtmlIds ) { - $legacyHeadline = false; - $safeHeadline = Sanitizer::escapeId( $safeHeadline, - 'noninitial' ); - } else { - # For reverse compatibility, provide an id that's - # HTML4-compatible, like we used to. - # - # It may be worth noting, academically, that it's possible for - # the legacy anchor to conflict with a non-legacy headline - # anchor on the page. In this case likely the "correct" thing - # would be to either drop the legacy anchors or make sure - # they're numbered first. However, this would require people - # to type in section names like "abc_.D7.93.D7.90.D7.A4" - # manually, so let's not bother worrying about it. - $legacyHeadline = Sanitizer::escapeId( $safeHeadline, - 'noninitial' ); - $safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' ); - - if ( $legacyHeadline == $safeHeadline ) { - # No reason to have both (in fact, we can't) - $legacyHeadline = false; - } elseif ( $legacyHeadline != Sanitizer::escapeId( - $legacyHeadline, 'xml' ) ) { - # The legacy id is invalid XML. We used to allow this, but - # there's no reason to do so anymore. Backward - # compatibility will fail slightly in this case, but it's - # no big deal. - $legacyHeadline = false; - } - } + list( $anchor, $legacyAnchor, $tocline, $headlineHint ) = + $this->processHeadingText( $headline ); # HTML names must be case-insensitively unique (bug 10721). 
FIXME: # Does this apply to Unicode characters? Because we aren't # handling those here. - $arrayKey = strtolower( $safeHeadline ); - if ( $legacyHeadline === false ) { + $arrayKey = strtolower( $anchor ); + if ( $legacyAnchor === false ) { $legacyArrayKey = false; } else { - $legacyArrayKey = strtolower( $legacyHeadline ); + $legacyArrayKey = strtolower( $legacyAnchor ); } # count how many in assoc. array so we can track dupes in anchors @@ -3679,12 +3625,10 @@ class Parser } # Create the anchor for linking from the TOC to the section - $anchor = $safeHeadline; - $legacyAnchor = $legacyHeadline; if ( $refers[$arrayKey] > 1 ) { $anchor .= '_' . $refers[$arrayKey]; } - if ( $legacyHeadline !== false && $refers[$legacyArrayKey] > 1 ) { + if ( $legacyAnchor !== false && $refers[$legacyArrayKey] > 1 ) { $legacyAnchor .= '_' . $refers[$legacyArrayKey]; } if( $enoughToc && ( !isset($wgMaxTocLevel) || $toclevel<$wgMaxTocLevel ) ) { @@ -3756,6 +3700,70 @@ class Parser } } + private function processHeadingText( $headline ) { + global $wgEnforceHtmlIds; + + # The safe header is a version of the header text safe to use for links + # Avoid insertion of weird stuff like by expanding the relevant sections + $safeHeadline = $this->mStripState->unstripBoth( $headline ); + + # Remove link placeholders by the link text. 
+ # + # turns into + # link text with suffix + $safeHeadline = $this->replaceLinkHoldersText( $safeHeadline ); + + # Strip out HTML (other than plain <sup> and <sub>: bug 8393) + $tocline = preg_replace( + array( '#<(?!/?(sup|sub)).*?'.'>#', '#<(/?(sup|sub)).*?'.'>#' ), + array( '', '<$1>'), + $safeHeadline + ); + $tocline = trim( $tocline ); + + # For the anchor, strip out HTML-y stuff period + $safeHeadline = preg_replace( '/<.*?'.'>/', '', $safeHeadline ); + $safeHeadline = trim( $safeHeadline ); + + # Save headline for section edit hint before it's escaped + $headlineHint = $safeHeadline; + + if ( $wgEnforceHtmlIds ) { + $legacyHeadline = false; + $safeHeadline = Sanitizer::escapeId( $safeHeadline, + 'noninitial' ); + } else { + # For reverse compatibility, provide an id that's + # HTML4-compatible, like we used to. + # + # It may be worth noting, academically, that it's possible for + # the legacy anchor to conflict with a non-legacy headline + # anchor on the page. In this case likely the "correct" thing + # would be to either drop the legacy anchors or make sure + # they're numbered first. However, this would require people + # to type in section names like "abc_.D7.93.D7.90.D7.A4" + # manually, so let's not bother worrying about it. + $legacyHeadline = Sanitizer::escapeId( $safeHeadline, + 'noninitial' ); + $safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' ); + + if ( $legacyHeadline == $safeHeadline ) { + # No reason to have both (in fact, we can't) + $legacyHeadline = false; + } elseif ( $legacyHeadline != Sanitizer::escapeId( + $legacyHeadline, 'xml' ) ) { + # The legacy id is invalid XML. We used to allow this, but + # there's no reason to do so anymore. Backward + # compatibility will fail slightly in this case, but it's + # no big deal. 
+ $legacyHeadline = false; + } + } + + return array( $safeHeadline, $legacyHeadline, $tocline, + $headlineHint ); + } + /** * Transform wiki markup when saving a page by doing \r\n -> \n * conversion, substitting signatures, {{subst:}} templates, etc. @@ -4736,21 +4744,9 @@ class Parser * "== Header ==". */ public function guessSectionNameFromWikiText( $text ) { - # Strip out wikitext links(they break the anchor) $text = $this->stripSectionName( $text ); - $headline = Sanitizer::decodeCharReferences( $text ); - # strip out HTML - $headline = StringUtils::delimiterReplace( '<', '>', '', $headline ); - $headline = trim( $headline ); - $sectionanchor = '#' . urlencode( str_replace( ' ', '_', $headline ) ); - $replacearray = array( - '%3A' => ':', - '%' => '.' - ); - return str_replace( - array_keys( $replacearray ), - array_values( $replacearray ), - $sectionanchor ); + list( $text, /* unneeded here */ ) = $this->processHeadingText( $text ); + return "#$text"; } /** -- 2.20.1